We are going to use:
# Packages used throughout this analysis.
# MASS provides lda(); it is loaded before the tidyverse so that
# MASS::select() does not mask dplyr::select().
library(MASS) # lda() for linear discriminant analysis
library(tidyverse) # data manipulation and visualization
library(plotly) # plots in 3D
library(ggplot2) # plots in 2D
library(ggpubr) # to combine multiple ggplot objects (ggarrange)
library(mvtnorm) # to generate multivariate normal distribution
library(dr) # SIR
library(factoextra) # PCA-related functions
Let’s first define a function to generate Gaussian data. This function takes four arguments: the number of observations n, the mean vector center, the covariance matrix sigma, and the class label to attach to every generated row.
# Generate n draws from a 3-dimensional Gaussian distribution.
#
# Arguments:
#   n      - number of observations to generate
#   center - length-3 mean vector
#   sigma  - 3x3 covariance matrix
#   label  - class label attached to every row (stored as a factor)
#
# Returns a data frame with numeric columns x, y, z and a factor column class.
generateGaussianData <- function(n, center, sigma, label) {
  # Fail early on mis-sized inputs instead of letting rmvnorm error cryptically.
  stopifnot(length(center) == 3, all(dim(sigma) == c(3, 3)))
  data <- rmvnorm(n, mean = center, sigma = sigma)
  data <- data.frame(data)
  names(data) <- c("x", "y", "z")
  data <- data %>% mutate(class = factor(label))
  data
}
Now let’s simulate a dataset.
# Common covariance matrix: unit variances, pairwise correlation 0.88.
# (Laid out row by row so the structure is visible; TRUE spelled out —
# T is an ordinary variable and can be reassigned.)
covmat <- matrix(c(1.00, 0.88, 0.88,
                   0.88, 1.00, 0.88,
                   0.88, 0.88, 1.00),
                 nrow = 3, byrow = TRUE)
# The three clusters share n and the covariance; they differ only in the
# x-coordinate of the mean (2, 4, 6), so separation lives along x.
n <- 200
sigma <- covmat
# cluster 1
group1 <- generateGaussianData(n, c(2, 8, 6), sigma, 1)
# cluster 2
group2 <- generateGaussianData(n, c(4, 8, 6), sigma, 2)
# cluster 3
group3 <- generateGaussianData(n, c(6, 8, 6), sigma, 3)
# all data
df <- bind_rows(group1, group2, group3)
# Preview the first rows of the combined data set (all from cluster 1,
# since bind_rows stacks the groups in order).
head(df)
## x y z class
## 1 1.4857568 8.016398 6.031452 1
## 2 0.3278638 6.522821 4.580953 1
## 3 1.3782497 7.458459 6.226041 1
## 4 2.6557762 8.763941 6.405316 1
## 5 2.2629447 8.175069 6.990975 1
## 6 2.8397608 8.713051 6.445625 1
# Marginal summaries; the class column confirms 200 observations per group,
# and the wide range of x reflects the three different x means.
summary(df)
## x y z class
## Min. :-0.1998 Min. : 5.069 Min. :2.125 1:200
## 1st Qu.: 2.5501 1st Qu.: 7.375 1st Qu.:5.346 2:200
## Median : 3.9791 Median : 8.008 Median :5.994 3:200
## Mean : 4.0333 Mean : 8.028 Mean :6.033
## 3rd Qu.: 5.5042 3rd Qu.: 8.720 3rd Qu.:6.694
## Max. : 8.3713 Max. :10.978 Max. :9.140
And plot our simulated data.
# Interactive 3D scatterplot of the simulated data, coloured by class.
fig <- plot_ly(
  df,
  x = ~x, y = ~y, z = ~z,
  color = ~class,
  colors = c('#b3e378', '#81e5f0', '#ed5391')
) %>%
  add_markers() %>%
  layout(scene = list(
    xaxis = list(title = 'x'),
    yaxis = list(title = 'y'),
    zaxis = list(title = 'z')
  ))
fig
Let’s perform LDA:
# Fit LDA on the three coordinates. lda() comes from the MASS package —
# make sure library(MASS) has been loaded. Note: df$class is already a
# factor (set in generateGaussianData), so factor(class) is redundant here.
lda.df <- lda(factor(class) ~ x + y + z, data = df)
lda.df
## Call:
## lda(factor(class) ~ x + y + z, data = df)
##
## Prior probabilities of groups:
## 1 2 3
## 0.3333333 0.3333333 0.3333333
##
## Group means:
## x y z
## 1 2.093123 8.078746 6.081409
## 2 3.936263 7.937088 5.959751
## 3 6.070590 8.067554 6.056851
##
## Coefficients of linear discriminants:
## LD1 LD2
## x 2.362772 0.0008242327
## y -1.133210 1.3646797330
## z -1.114203 -0.4343873891
##
## Proportion of trace:
## LD1 LD2
## 0.9997 0.0003
# LD1 carries essentially all between-group variation (0.9997 of the trace),
# consistent with the groups differing only in their x means.
Let us plot the projections on LD1 and LD2
# Project the observations onto the discriminant directions LD1/LD2.
# Bug fix: predict.lda() takes `newdata =`, not `data =`. The original
# `data = df` was silently swallowed by `...`, so predict() fell back to
# the training data — same result here, but only by coincidence.
predmodel.lda <- predict(lda.df, newdata = df)
# Predicted classes: predict.lda() already returns them in $class,
# which equals taking which.max over the posterior probabilities row-wise.
estclass <- predmodel.lda$class
# Projections coloured by the LDA-predicted classes.
newdata2 <- data.frame(type = estclass, lda = predmodel.lda$x)
p1 <- ggplot(newdata2) +
  geom_point(aes(lda.LD1, lda.LD2, colour = type), size = 2.5) +
  ggtitle("projections with LDA classes")
# Projections coloured by the true classes, for comparison.
newdata <- data.frame(type = df$class, lda = predmodel.lda$x)
p2 <- ggplot(newdata) +
  geom_point(aes(lda.LD1, lda.LD2, colour = type), size = 2.5) +
  ggtitle("projections with true classes")
# Stack the two panels vertically.
ggarrange(p1, p2,
          nrow = 2)
Now let us perform PCA.
# PCA on the three coordinate columns; selecting by name rather than
# position makes the intent explicit (same columns, same result).
pc <- prcomp(df[, c("x", "y", "z")])
# Eigenvalues and percentage of variance explained per component.
get_eig(pc)
## eigenvalue variance.percent cumulative.variance.percent
## Dim.1 4.3887333 76.074162 76.07416
## Dim.2 1.2629284 21.891560 97.96572
## Dim.3 0.1173579 2.034277 100.00000
This is the corresponding biplot.
# Biplot: individuals coloured by their true class, variable arrows in blue;
# label = "var" annotates only the variable arrows, not the 600 points.
fviz_pca_biplot(pc,
                col.var = "#2E9FDF",
                col.ind = df$class,
                label = "var")
Note that considering only the first principal component it is impossible to notice differences between the three groups (all groups are overlapping).
Now we use SIR (Sliced Inverse Regression) from the dr package.
# default fitting method is "sir"
# Sliced Inverse Regression: with a categorical response, the "slices"
# are exactly the three class groups.
help(dr)
dr_res <- dr(class ~ x + y + z, data = df, method='sir')
dr_res
##
## dr(formula = class ~ x + y + z, data = df, method = "sir")
## Estimated Basis Vectors for Central Subspace:
## Dir1 Dir2 Dir3
## x 0.8297682 -0.0005755226 -0.003033721
## y -0.3979656 -0.9528910951 0.628512122
## z -0.3912904 0.3033120994 -0.777793873
## Eigenvalues:
## [1] 9.371909e-01 4.253111e-03 6.278830e-18
# The first eigenvalue dominates, so a single direction (Dir1, driven mainly
# by x) captures the group separation — matching the LDA result above.
plot(dr_res, col=df$class)
# Components of the fitted dr object, for reference.
names(dr_res)
## [1] "x" "y" "weights" "method" "cases"
## [6] "qr" "group" "chi2approx" "evectors" "evalues"
## [11] "numdir" "raw.evectors" "M" "slice.info" "call"
## [16] "y.name" "terms"